In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import plotly.io as pio
pio.renderers.default = 'notebook'
In [50]:
df = pd.read_csv('DajuBhaiMinimart.csv')
df
Out[50]:
CustomerID CustomerName Quantity Price (NPR) Sales (NPR) Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied (NPR) LoyaltyMember CustomerContact Feedback Gender
0 CUST001 Bishal Gurung 11 1303 17199 Website 4 Electronics Card 12/6/2023 5:13 - 60 Yes 9819894305 Quick delivery Male
1 CUST002 Sita Tamang 10 610 7246 Market Visit 4 Clothing Mobile Payment 12/22/2023 12:54 Pokhara 240 Yes 9823584905 Satisfied with quality Female
2 CUST003 Sarita Lama 7 1812 12684 Market Visit 3 Electronics Mobile Payment 10/13/2024 1:03 Lalitpur 200 No 9823088823 Friendly staff Female
3 CUST004 Ram Shrestha 3 1070 3210 Website 1 Electronics Mobile Payment 6/5/2024 6:08 - 100 No 9888996168 Quick delivery Male
4 CUST005 Sita Tamang 10 1209 14362 Market Visit 5 Personal Care Card 10/26/2024 16:28 Bhaktapur 60 Yes 9874939195 Prices a bit high Female
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 CUST296 Kamal Sharma 6 919 5514 Website 2 Personal Care Mobile Payment 4/23/2024 5:18 - 50 No 9835949686 Satisfied with quality Male
296 CUST297 Asha Rai 7 685 4795 Market Visit 3 Groceries Mobile Payment 10/5/2024 5:05 Lalitpur 0 No 9869059335 Satisfied with quality Male
297 CUST298 Kamal Sharma 8 342 2736 Market Visit 1 Electronics Card 12/1/2023 8:10 Pokhara 200 No 9866858364 Prices a bit high Male
298 CUST299 Gurax Gurung 11 217 2864 Market Visit 4 Household Cash 1/21/2024 2:00 Pokhara 60 Yes 9816458335 Friendly staff Female
299 CUST300 Sarita Lama 9 1644 17360 Market Visit 3 Clothing Cash 11/12/2023 14:47 Lalitpur 0 Yes 9893314124 Prices a bit high Female

300 rows × 16 columns

Data Cleaning¶

In [51]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   CustomerID             300 non-null    object
 1   CustomerName           300 non-null    object
 2   Quantity               300 non-null    int64 
 3   Price (NPR)            300 non-null    int64 
 4   Sales (NPR)            300 non-null    int64 
 5   Channel                300 non-null    object
 6   Ratings                300 non-null    int64 
 7   ProductCategory        300 non-null    object
 8   PaymentMethod          300 non-null    object
 9   TransactionDate        300 non-null    object
 10  StoreLocation          300 non-null    object
 11  DiscountApplied (NPR)  300 non-null    int64 
 12  LoyaltyMember          300 non-null    object
 13  CustomerContact        300 non-null    int64 
 14  Feedback               300 non-null    object
 15  Gender                 300 non-null    object
dtypes: int64(6), object(10)
memory usage: 37.6+ KB
In [52]:
df.isnull().sum()
Out[52]:
CustomerID               0
CustomerName             0
Quantity                 0
Price (NPR)              0
Sales (NPR)              0
Channel                  0
Ratings                  0
ProductCategory          0
PaymentMethod            0
TransactionDate          0
StoreLocation            0
DiscountApplied (NPR)    0
LoyaltyMember            0
CustomerContact          0
Feedback                 0
Gender                   0
dtype: int64
In [53]:
# drop customer_id column
df.drop('CustomerID', axis=1, inplace=True)
In [54]:
#rename the column name
df = df.rename(columns={'Price (NPR)': 'Price'})
df = df.rename(columns={'Sales (NPR)': 'Sales'})
df = df.rename(columns={'DiscountApplied (NPR)': 'DiscountApplied'})
In [55]:
df
Out[55]:
CustomerName Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied LoyaltyMember CustomerContact Feedback Gender
0 Bishal Gurung 11 1303 17199 Website 4 Electronics Card 12/6/2023 5:13 - 60 Yes 9819894305 Quick delivery Male
1 Sita Tamang 10 610 7246 Market Visit 4 Clothing Mobile Payment 12/22/2023 12:54 Pokhara 240 Yes 9823584905 Satisfied with quality Female
2 Sarita Lama 7 1812 12684 Market Visit 3 Electronics Mobile Payment 10/13/2024 1:03 Lalitpur 200 No 9823088823 Friendly staff Female
3 Ram Shrestha 3 1070 3210 Website 1 Electronics Mobile Payment 6/5/2024 6:08 - 100 No 9888996168 Quick delivery Male
4 Sita Tamang 10 1209 14362 Market Visit 5 Personal Care Card 10/26/2024 16:28 Bhaktapur 60 Yes 9874939195 Prices a bit high Female
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 Kamal Sharma 6 919 5514 Website 2 Personal Care Mobile Payment 4/23/2024 5:18 - 50 No 9835949686 Satisfied with quality Male
296 Asha Rai 7 685 4795 Market Visit 3 Groceries Mobile Payment 10/5/2024 5:05 Lalitpur 0 No 9869059335 Satisfied with quality Male
297 Kamal Sharma 8 342 2736 Market Visit 1 Electronics Card 12/1/2023 8:10 Pokhara 200 No 9866858364 Prices a bit high Male
298 Gurax Gurung 11 217 2864 Market Visit 4 Household Cash 1/21/2024 2:00 Pokhara 60 Yes 9816458335 Friendly staff Female
299 Sarita Lama 9 1644 17360 Market Visit 3 Clothing Cash 11/12/2023 14:47 Lalitpur 0 Yes 9893314124 Prices a bit high Female

300 rows × 15 columns

In [56]:
df.head()
Out[56]:
CustomerName Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied LoyaltyMember CustomerContact Feedback Gender
0 Bishal Gurung 11 1303 17199 Website 4 Electronics Card 12/6/2023 5:13 - 60 Yes 9819894305 Quick delivery Male
1 Sita Tamang 10 610 7246 Market Visit 4 Clothing Mobile Payment 12/22/2023 12:54 Pokhara 240 Yes 9823584905 Satisfied with quality Female
2 Sarita Lama 7 1812 12684 Market Visit 3 Electronics Mobile Payment 10/13/2024 1:03 Lalitpur 200 No 9823088823 Friendly staff Female
3 Ram Shrestha 3 1070 3210 Website 1 Electronics Mobile Payment 6/5/2024 6:08 - 100 No 9888996168 Quick delivery Male
4 Sita Tamang 10 1209 14362 Market Visit 5 Personal Care Card 10/26/2024 16:28 Bhaktapur 60 Yes 9874939195 Prices a bit high Female
In [57]:
df['ProductCategory'].unique()
Out[57]:
array(['Electronics', 'Clothing', 'Personal Care', 'Groceries',
       'Household'], dtype=object)
In [58]:
df
Out[58]:
CustomerName Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied LoyaltyMember CustomerContact Feedback Gender
0 Bishal Gurung 11 1303 17199 Website 4 Electronics Card 12/6/2023 5:13 - 60 Yes 9819894305 Quick delivery Male
1 Sita Tamang 10 610 7246 Market Visit 4 Clothing Mobile Payment 12/22/2023 12:54 Pokhara 240 Yes 9823584905 Satisfied with quality Female
2 Sarita Lama 7 1812 12684 Market Visit 3 Electronics Mobile Payment 10/13/2024 1:03 Lalitpur 200 No 9823088823 Friendly staff Female
3 Ram Shrestha 3 1070 3210 Website 1 Electronics Mobile Payment 6/5/2024 6:08 - 100 No 9888996168 Quick delivery Male
4 Sita Tamang 10 1209 14362 Market Visit 5 Personal Care Card 10/26/2024 16:28 Bhaktapur 60 Yes 9874939195 Prices a bit high Female
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 Kamal Sharma 6 919 5514 Website 2 Personal Care Mobile Payment 4/23/2024 5:18 - 50 No 9835949686 Satisfied with quality Male
296 Asha Rai 7 685 4795 Market Visit 3 Groceries Mobile Payment 10/5/2024 5:05 Lalitpur 0 No 9869059335 Satisfied with quality Male
297 Kamal Sharma 8 342 2736 Market Visit 1 Electronics Card 12/1/2023 8:10 Pokhara 200 No 9866858364 Prices a bit high Male
298 Gurax Gurung 11 217 2864 Market Visit 4 Household Cash 1/21/2024 2:00 Pokhara 60 Yes 9816458335 Friendly staff Female
299 Sarita Lama 9 1644 17360 Market Visit 3 Clothing Cash 11/12/2023 14:47 Lalitpur 0 Yes 9893314124 Prices a bit high Female

300 rows × 15 columns

In [59]:
# Calculate additional columns
df['TotalDiscount'] = df['DiscountApplied'] * df['Quantity']
df['CustomerFrequency'] = df.groupby('CustomerName')['CustomerName'].transform('count')
df['AverageRatingPerCustomer'] = df.groupby('CustomerName')['Ratings'].transform('mean')
df['TotalSalesPerCustomer'] = df.groupby('CustomerName')['Sales'].transform('sum')

# Extract date features before dropping TransactionDate
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['IsWeekend'] = df['TransactionDate'].dt.dayofweek >= 5
df['IsWeekend'] = df['IsWeekend'].astype(int)
In [60]:
# Extract date features
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['Year'] = df['TransactionDate'].dt.year
df['Month'] = df['TransactionDate'].dt.month
df['Day'] = df['TransactionDate'].dt.day
df['DayOfWeek'] = df['TransactionDate'].dt.dayofweek
In [61]:
# Map day of the week numbers to names
day_of_week_map = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
df['DayOfWeek'] = df['DayOfWeek'].map(day_of_week_map)
In [62]:
# Drop unnecessary columns
df.drop(columns=['CustomerName', 'CustomerContact', 'Feedback'], inplace=True)
In [63]:
df
Out[63]:
Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied ... Gender TotalDiscount CustomerFrequency AverageRatingPerCustomer TotalSalesPerCustomer IsWeekend Year Month Day DayOfWeek
0 11 1303 17199 Website 4 Electronics Card 2023-12-06 05:13:00 - 60 ... Male 660 49 3.693878 371058 0 2023 12 6 Wednesday
1 10 610 7246 Market Visit 4 Clothing Mobile Payment 2023-12-22 12:54:00 Pokhara 240 ... Female 2400 47 3.361702 341186 0 2023 12 22 Friday
2 7 1812 12684 Market Visit 3 Electronics Mobile Payment 2024-10-13 01:03:00 Lalitpur 200 ... Female 1400 42 3.380952 278411 1 2024 10 13 Sunday
3 3 1070 3210 Website 1 Electronics Mobile Payment 2024-06-05 06:08:00 - 100 ... Male 300 43 3.372093 211555 0 2024 6 5 Wednesday
4 10 1209 14362 Market Visit 5 Personal Care Card 2024-10-26 16:28:00 Bhaktapur 60 ... Female 600 47 3.361702 341186 1 2024 10 26 Saturday
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 6 919 5514 Website 2 Personal Care Mobile Payment 2024-04-23 05:18:00 - 50 ... Male 300 47 3.531915 383490 0 2024 4 23 Tuesday
296 7 685 4795 Market Visit 3 Groceries Mobile Payment 2024-10-05 05:05:00 Lalitpur 0 ... Male 0 36 3.361111 243614 1 2024 10 5 Saturday
297 8 342 2736 Market Visit 1 Electronics Card 2023-12-01 08:10:00 Pokhara 200 ... Male 1600 47 3.531915 383490 0 2023 12 1 Friday
298 11 217 2864 Market Visit 4 Household Cash 2024-01-21 02:00:00 Pokhara 60 ... Female 660 1 4.000000 2864 1 2024 1 21 Sunday
299 9 1644 17360 Market Visit 3 Clothing Cash 2023-11-12 14:47:00 Lalitpur 0 ... Female 0 42 3.380952 278411 1 2023 11 12 Sunday

300 rows × 21 columns

Data Visualization¶

In [64]:
# Group by Gender and count the number of visits
gender_distribution = df['Gender'].value_counts().reset_index()
gender_distribution.columns = ['Gender', 'Count']

# Create a pie chart
fig = px.pie(gender_distribution, 
             values='Count', 
             names='Gender', 
             title='Gender Distribution of Store Visitors')
fig.update_layout(width=800, height=600)

fig.show()
In [65]:
# visualizations based on Total Sales by Product Category plotly
total_sales_by_product_category = df.groupby('ProductCategory')['Sales'].sum().reset_index()
total_sales_by_product_category = total_sales_by_product_category.sort_values(by='Sales', ascending=False)
#bar
fig = px.bar(total_sales_by_product_category, x='ProductCategory', y='Sales',color='ProductCategory', title='Total Sales by Product Category')
fig.update_layout(xaxis_title="Product Category", yaxis_title="Total Sales")
fig.show()
In [66]:
# visualization based average rating by product category
average_rating_by_product_category = df.groupby('ProductCategory')['Ratings'].mean().reset_index()
average_rating_by_product_category = average_rating_by_product_category.sort_values(by='Ratings', ascending=False)

#pie chart
fig = px.pie(average_rating_by_product_category, names='ProductCategory', values='Ratings', title='Average Rating by Product Category')
fig.show()
In [67]:
#total Sales by payment method
total_sales_by_payment_method = df.groupby('PaymentMethod')['Sales'].sum().reset_index()
total_sales_by_payment_method = total_sales_by_payment_method.sort_values(by='Sales', ascending=False)

# histogram
fig = px.histogram(df, x='PaymentMethod', y='Sales', color='PaymentMethod', title='Total Sales by Payment Method', histfunc='sum')
fig.update_layout(xaxis_title="Payment Method", yaxis_title="Total Sales")
fig.show()
In [68]:
#total sales by channel
total_sales_by_channel = df.groupby('Channel')['Sales'].sum().reset_index()
total_sales_by_channel = total_sales_by_channel.sort_values(by='Sales', ascending=False)

#bar plotly
fig = px.bar(total_sales_by_channel, x='Channel', y='Sales',color='Sales',color_continuous_scale='temps', title='TotalSales by Channel')
fig.update_layout(xaxis_title="Channel", yaxis_title="Total Sales")
fig.show()
In [69]:
#total sales by day of the week
total_sales_by_day = df.groupby('DayOfWeek')['Sales'].sum().reset_index()
days_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
total_sales_by_day['DayOfWeek'] = pd.Categorical(total_sales_by_day['DayOfWeek'], categories=days_order, ordered=True)
total_sales_by_day = total_sales_by_day.sort_values('DayOfWeek')

fig = px.bar(total_sales_by_day, x='DayOfWeek', y='Sales', title='Total Sales by Day of the Week', color='Sales', color_continuous_scale='temps')
fig.update_layout(xaxis_title="Day of the Week", yaxis_title="Total Sales (NPR)")
fig.show()
In [70]:
#visualizations based on Total Sales by Month
total_sales_by_month = df.groupby('Month')['Sales'].sum().reset_index()
fig = px.scatter(total_sales_by_month, x='Month', y='Sales', title='Total Sales by Month',color='Sales',color_continuous_scale='matter')
fig.show()
In [71]:
# Calculate average discount applied by product category
average_discount_by_category = df.groupby('ProductCategory')['DiscountApplied'].mean().reset_index()

# Create a line graph
fig = px.line(average_discount_by_category, x='ProductCategory', y='DiscountApplied', title='Average Discount Applied by Product Category')
fig.update_layout(xaxis_title="Product Category", yaxis_title="Average Discount Applied")
fig.show()
In [72]:
#total sales by gender
sales_by_gender = df.groupby('Gender')['Sales'].sum().reset_index()
#pie
fig = px.pie(sales_by_gender, values='Sales', names='Gender', title='Total Sales by Gender',color='Gender')
fig.show()
In [73]:
# Visualize the total quantity sold by product category
quantity_by_category = df.groupby('ProductCategory')['Quantity'].sum().reset_index()

fig = px.bar(quantity_by_category, 
                 x='ProductCategory', 
                 y='Quantity', 
                 title='Total Quantity Sold by Product Category',
                 color='Quantity',
                 color_continuous_scale='matter',
)
# Update the layout to make the plot bigger
fig.update_layout(width=800, height=600)

fig.show()

Feature Engineering¶

In [74]:
#loyality member 
df['LoyaltyMember'] = df['LoyaltyMember'].map({
    'Yes': '1',
    'No': '0'
})
In [75]:
#product Category mapping
df['ProductCategory'] = df['ProductCategory'].map({
    'Electronics': '0',
    'Clothing': '1',
    'Groceries': '2',
    'Personal Care': '3',
    'Household': '4'
})
In [76]:
#mapping channel name
df['Channel'] = df['Channel'].map({
    'Website': '0',
    'Market Visit': '1'
})
df['Channel'] = pd.to_numeric(df['Channel'], errors='coerce')
In [77]:
# payment method mapping
df['PaymentMethod'] = df['PaymentMethod'].map({
    'Card': '0',
    'Mobile Payment': '1',
    'Cash': '2'
})
df.head()
Out[77]:
Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied ... Gender TotalDiscount CustomerFrequency AverageRatingPerCustomer TotalSalesPerCustomer IsWeekend Year Month Day DayOfWeek
0 11 1303 17199 0 4 0 0 2023-12-06 05:13:00 - 60 ... Male 660 49 3.693878 371058 0 2023 12 6 Wednesday
1 10 610 7246 1 4 1 1 2023-12-22 12:54:00 Pokhara 240 ... Female 2400 47 3.361702 341186 0 2023 12 22 Friday
2 7 1812 12684 1 3 0 1 2024-10-13 01:03:00 Lalitpur 200 ... Female 1400 42 3.380952 278411 1 2024 10 13 Sunday
3 3 1070 3210 0 1 0 1 2024-06-05 06:08:00 - 100 ... Male 300 43 3.372093 211555 0 2024 6 5 Wednesday
4 10 1209 14362 1 5 3 0 2024-10-26 16:28:00 Bhaktapur 60 ... Female 600 47 3.361702 341186 1 2024 10 26 Saturday

5 rows × 21 columns

In [78]:
# Ratings Category (Low, Medium, High)
def categorize_rating(Ratings):
    if Ratings <= 2.5:
        return 'Low'
    elif Ratings <= 4.0:
        return 'Medium'
    else:
        return 'High'

df['Rating_Category'] = df['Ratings'].apply(categorize_rating)
df.head()
Out[78]:
Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied ... TotalDiscount CustomerFrequency AverageRatingPerCustomer TotalSalesPerCustomer IsWeekend Year Month Day DayOfWeek Rating_Category
0 11 1303 17199 0 4 0 0 2023-12-06 05:13:00 - 60 ... 660 49 3.693878 371058 0 2023 12 6 Wednesday Medium
1 10 610 7246 1 4 1 1 2023-12-22 12:54:00 Pokhara 240 ... 2400 47 3.361702 341186 0 2023 12 22 Friday Medium
2 7 1812 12684 1 3 0 1 2024-10-13 01:03:00 Lalitpur 200 ... 1400 42 3.380952 278411 1 2024 10 13 Sunday Medium
3 3 1070 3210 0 1 0 1 2024-06-05 06:08:00 - 100 ... 300 43 3.372093 211555 0 2024 6 5 Wednesday Low
4 10 1209 14362 1 5 3 0 2024-10-26 16:28:00 Bhaktapur 60 ... 600 47 3.361702 341186 1 2024 10 26 Saturday High

5 rows × 22 columns

Model Training¶

In [79]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Quantity                  300 non-null    int64         
 1   Price                     300 non-null    int64         
 2   Sales                     300 non-null    int64         
 3   Channel                   300 non-null    int64         
 4   Ratings                   300 non-null    int64         
 5   ProductCategory           300 non-null    object        
 6   PaymentMethod             300 non-null    object        
 7   TransactionDate           300 non-null    datetime64[ns]
 8   StoreLocation             300 non-null    object        
 9   DiscountApplied           300 non-null    int64         
 10  LoyaltyMember             300 non-null    object        
 11  Gender                    300 non-null    object        
 12  TotalDiscount             300 non-null    int64         
 13  CustomerFrequency         300 non-null    int64         
 14  AverageRatingPerCustomer  300 non-null    float64       
 15  TotalSalesPerCustomer     300 non-null    int64         
 16  IsWeekend                 300 non-null    int64         
 17  Year                      300 non-null    int32         
 18  Month                     300 non-null    int32         
 19  Day                       300 non-null    int32         
 20  DayOfWeek                 300 non-null    object        
 21  Rating_Category           300 non-null    object        
dtypes: datetime64[ns](1), float64(1), int32(3), int64(10), object(7)
memory usage: 48.2+ KB
In [80]:
#data to numeric to columns
df['ProductCategory'] = pd.to_numeric(df['ProductCategory'], errors='coerce')
df['PaymentMethod'] = pd.to_numeric(df['PaymentMethod'], errors='coerce')
df['LoyalityMember'] = pd.to_numeric(df['LoyaltyMember'], errors='coerce')
df.head()
Out[80]:
Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate StoreLocation DiscountApplied ... CustomerFrequency AverageRatingPerCustomer TotalSalesPerCustomer IsWeekend Year Month Day DayOfWeek Rating_Category LoyalityMember
0 11 1303 17199 0 4 0 0 2023-12-06 05:13:00 - 60 ... 49 3.693878 371058 0 2023 12 6 Wednesday Medium 1
1 10 610 7246 1 4 1 1 2023-12-22 12:54:00 Pokhara 240 ... 47 3.361702 341186 0 2023 12 22 Friday Medium 1
2 7 1812 12684 1 3 0 1 2024-10-13 01:03:00 Lalitpur 200 ... 42 3.380952 278411 1 2024 10 13 Sunday Medium 0
3 3 1070 3210 0 1 0 1 2024-06-05 06:08:00 - 100 ... 43 3.372093 211555 0 2024 6 5 Wednesday Low 0
4 10 1209 14362 1 5 3 0 2024-10-26 16:28:00 Bhaktapur 60 ... 47 3.361702 341186 1 2024 10 26 Saturday High 1

5 rows × 23 columns

In [81]:
df.info()
df['Channel'].unique()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Quantity                  300 non-null    int64         
 1   Price                     300 non-null    int64         
 2   Sales                     300 non-null    int64         
 3   Channel                   300 non-null    int64         
 4   Ratings                   300 non-null    int64         
 5   ProductCategory           300 non-null    int64         
 6   PaymentMethod             300 non-null    int64         
 7   TransactionDate           300 non-null    datetime64[ns]
 8   StoreLocation             300 non-null    object        
 9   DiscountApplied           300 non-null    int64         
 10  LoyaltyMember             300 non-null    object        
 11  Gender                    300 non-null    object        
 12  TotalDiscount             300 non-null    int64         
 13  CustomerFrequency         300 non-null    int64         
 14  AverageRatingPerCustomer  300 non-null    float64       
 15  TotalSalesPerCustomer     300 non-null    int64         
 16  IsWeekend                 300 non-null    int64         
 17  Year                      300 non-null    int32         
 18  Month                     300 non-null    int32         
 19  Day                       300 non-null    int32         
 20  DayOfWeek                 300 non-null    object        
 21  Rating_Category           300 non-null    object        
 22  LoyalityMember            300 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(13), object(5)
memory usage: 50.5+ KB
Out[81]:
array([0, 1])
In [82]:
df.describe()
Out[82]:
Quantity Price Sales Channel Ratings ProductCategory PaymentMethod TransactionDate DiscountApplied TotalDiscount CustomerFrequency AverageRatingPerCustomer TotalSalesPerCustomer IsWeekend Year Month Day LoyalityMember
count 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000 300 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000 300.000000
mean 6.126667 1046.113333 6848.303333 0.470000 3.473333 1.946667 0.946667 2024-05-14 01:22:03.600000 98.366667 603.300000 41.733333 3.473333 291138.376667 0.296667 2023.853333 6.673333 15.666667 0.490000
min 1.000000 107.000000 145.000000 0.000000 1.000000 0.000000 0.000000 2023-11-10 23:24:00 0.000000 0.000000 1.000000 1.000000 1071.000000 0.000000 2023.000000 1.000000 1.000000 0.000000
25% 4.000000 532.000000 2413.250000 0.000000 2.000000 1.000000 0.000000 2024-02-07 20:28:45 50.000000 87.500000 42.000000 3.361702 211555.000000 0.000000 2024.000000 4.000000 7.750000 0.000000
50% 6.000000 1086.000000 5165.500000 0.000000 4.000000 2.000000 1.000000 2024-05-19 12:17:00 100.000000 480.000000 43.000000 3.380952 278411.000000 0.000000 2024.000000 7.000000 16.000000 0.000000
75% 9.000000 1524.250000 10034.000000 1.000000 5.000000 3.000000 2.000000 2024-08-11 11:19:00 120.000000 960.000000 47.000000 3.535714 371058.000000 1.000000 2024.000000 10.000000 24.000000 1.000000
max 11.000000 1998.000000 25819.000000 1.000000 5.000000 4.000000 2.000000 2024-11-04 18:03:00 240.000000 2640.000000 49.000000 5.000000 383490.000000 1.000000 2024.000000 12.000000 31.000000 1.000000
std 2.997874 554.192076 5634.298122 0.499933 1.332369 1.500442 0.824270 NaN 79.662744 593.775497 9.144815 0.235653 83860.974326 0.457552 0.354364 3.465061 9.047379 0.500735
In [83]:
corr = df.corr(numeric_only=1)
corr
Out[83]:
Quantity Price Sales Channel Ratings ProductCategory PaymentMethod DiscountApplied TotalDiscount CustomerFrequency AverageRatingPerCustomer TotalSalesPerCustomer IsWeekend Year Month Day LoyalityMember
Quantity 1.000000 0.012748 0.668159 0.058332 -0.079534 0.029761 0.036580 0.002690 0.482048 0.032955 -0.058428 0.084254 0.001772 -0.010788 -0.024658 -0.051461 0.183539
Price 0.012748 1.000000 0.647220 -0.044857 0.068485 -0.012232 -0.035840 -0.091466 -0.086800 0.027917 0.109680 0.092431 0.022302 -0.021935 0.045250 -0.073137 0.003668
Sales 0.668159 0.647220 1.000000 -0.018217 0.038459 0.004102 0.007956 -0.073233 0.233778 0.103906 0.042509 0.166571 0.035740 -0.033195 0.020542 -0.113691 0.173722
Channel 0.058332 -0.044857 -0.018217 1.000000 0.026411 0.100407 -0.028244 0.169659 0.133675 0.009217 0.011350 0.007948 0.104833 -0.043798 0.061897 0.020704 -0.001202
Ratings -0.079534 0.068485 0.038459 0.026411 1.000000 0.005978 0.044381 0.054888 -0.027304 -0.008820 0.176868 0.019365 -0.011667 -0.015395 -0.032319 -0.055120 0.307897
ProductCategory 0.029761 -0.012232 0.004102 0.100407 0.005978 1.000000 -0.018533 0.026690 0.054105 -0.002746 -0.063846 -0.007227 0.076711 0.054431 -0.022017 -0.019053 0.030448
PaymentMethod 0.036580 -0.035840 0.007956 -0.028244 0.044381 -0.018533 1.000000 0.018533 -0.009548 -0.079983 0.087827 -0.029193 0.033225 -0.015419 -0.132586 0.063982 0.071631
DiscountApplied 0.002690 -0.091466 -0.073233 0.169659 0.054888 0.026690 0.018533 1.000000 0.787155 0.041269 0.027291 0.039173 0.060134 0.049538 -0.106865 0.012745 0.150925
TotalDiscount 0.482048 -0.086800 0.233778 0.133675 -0.027304 0.054105 -0.009548 0.787155 1.000000 0.069091 0.027262 0.091305 0.018420 0.034892 -0.088651 -0.000299 0.182845
CustomerFrequency 0.032955 0.027917 0.103906 0.009217 -0.008820 -0.002746 -0.079983 0.041269 0.069091 1.000000 -0.049870 0.867935 0.002984 -0.058552 0.106904 -0.011709 0.041777
AverageRatingPerCustomer -0.058428 0.109680 0.042509 0.011350 0.176868 -0.063846 0.087827 0.027291 0.027262 -0.049870 1.000000 0.109487 -0.030067 0.015437 -0.013972 0.086413 0.048151
TotalSalesPerCustomer 0.084254 0.092431 0.166571 0.007948 0.019365 -0.007227 -0.029193 0.039173 0.091305 0.867935 0.109487 1.000000 -0.036349 -0.017618 0.049690 0.008862 0.050110
IsWeekend 0.001772 0.022302 0.035740 0.104833 -0.011667 0.076711 0.033225 0.060134 0.018420 0.002984 -0.030067 -0.036349 1.000000 -0.060781 0.002264 -0.025315 0.034888
Year -0.010788 -0.021935 -0.033195 -0.043798 -0.015395 0.054431 -0.015419 0.049538 0.034892 -0.058552 0.015437 -0.017618 -0.060781 1.000000 -0.616585 -0.065372 0.010555
Month -0.024658 0.045250 0.020542 0.061897 -0.032319 -0.022017 -0.132586 -0.106865 -0.088651 0.106904 -0.013972 0.049690 0.002264 -0.616585 1.000000 0.048896 -0.113688
Day -0.051461 -0.073137 -0.113691 0.020704 -0.055120 -0.019053 0.063982 0.012745 -0.000299 -0.011709 0.086413 0.008862 -0.025315 -0.065372 0.048896 1.000000 -0.033221
LoyalityMember 0.183539 0.003668 0.173722 -0.001202 0.307897 0.030448 0.071631 0.150925 0.182845 0.041777 0.048151 0.050110 0.034888 0.010555 -0.113688 -0.033221 1.000000
In [84]:
# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='magma', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
No description has been provided for this image
In [85]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score

Features= df[['Quantity', 'Price','ProductCategory', 'PaymentMethod', 'LoyaltyMember']]
Target = df['Sales']

X_train, X_test, y_train, y_test = train_test_split(Features, Target, test_size=0.2, random_state=42) 
In [100]:
#gradient boosting regressor
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_absolute_error(y_test, y_pred)
r2_reg= r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mse}')
print(f'R-squared: {r2_reg}')
#to view the actual and predicted values
for actual, predicted in zip(y_test, y_pred):
    print(f"Actual: {actual}, Predicted: {int(predicted)}")
Mean Absolute Error: 0.12313124521990503
R-squared: 0.8302825355373844
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
In [87]:
# Visualization of predicted vs actual costs
fig_cost = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y_test.min(), y0=y_test.min(),
    x1=y_test.max(), y1=y_test.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()

Hyperparameter tuning and cross validation for Gradient Boosting Regressor¶

In [ ]:
#hyperparameter tuning and cross validation for gradientboostingRegressor
gbr = GradientBoostingRegressor(
    random_state=42, 
    n_estimators=200,         
    max_depth=3,           
    min_samples_split=2,      
    min_samples_leaf=2, 
    learning_rate=0.1,      
)

gbr.fit(X_train, y_train)

y_pred_reg = gbr.predict(X_test)

cv_scores_r2_reg = cross_val_score(gbr, Features, Target, cv=5, scoring='r2')
cv_scores_mae = cross_val_score(gbr, Features, Target, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae 
print("Mean R2 score:", cv_scores_r2_reg.mean())
print("Mean MAE score:", cv_scores_mae.mean())
#to view the actual and predicted values
for actual, predicted in zip(y_test, y_pred_reg):
    print(f"Actual: {actual}, Predicted: {int(predicted)}")
Mean R2 score: 0.9926229878124989
Mean MAE score: 348.9119389870635
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
In [89]:
# Visualization of predicted vs actual costs after hyperparameter tuning
fig_cost = px.scatter(x=y_test, y=y_pred_reg, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y_test.min(), y0=y_test.min(),
    x1=y_test.max(), y1=y_test.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
In [90]:
#save the model
import joblib
joblib.dump(gbr, 'sales_prediction_model.pkl')
Out[90]:
['sales_prediction_model.pkl']
In [91]:
# for classification of the loyalty member
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Features_clf = df[['Quantity','Sales', 'ProductCategory', 'PaymentMethod', 'DiscountApplied', 'TotalDiscount', 'CustomerFrequency', 'AverageRatingPerCustomer', 'TotalSalesPerCustomer']]
Target_clf = df['LoyaltyMember']

X_train, X_test, y_train, y_test = train_test_split(Features_clf, Target_clf, test_size=0.2, random_state=42)

model_clf = RandomForestClassifier()

model_clf.fit(X_train, y_train)
y_pred_clf = model_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_clf)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred_clf))
Accuracy: 0.8833333333333333
              precision    recall  f1-score   support

           0       0.97      0.83      0.89        35
           1       0.80      0.96      0.87        25

    accuracy                           0.88        60
   macro avg       0.88      0.89      0.88        60
weighted avg       0.90      0.88      0.88        60

In [92]:
# confusion matrix by plotly
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_clf)

# Create a confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=['Predicted Negative', 'Predicted Positive'],
    y=['Actual Negative', 'Actual Positive'],
    hoverongaps=False,
    colorscale='Viridis'))

fig.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted Label',
    yaxis_title='True Label'
)

fig.show()
In [93]:
# random forest classifier
from sklearn.tree import plot_tree

num_estimators = len(model_clf.estimators_)
print(f'The model has {num_estimators} estimators.')

plt.figure(figsize=(20, 10))
plot_tree(model_clf.estimators_[0], filled=True, feature_names=Features_clf.columns, class_names=[str(cls) for cls in model_clf.classes_])
plt.show()
The model has 100 estimators.
No description has been provided for this image

Hyperparameter tuning and cross validation for RandomForestClassifier¶

In [94]:
#hyperparameter tuning and cross validation for RandomForestClassifier
rfc = RandomForestClassifier(
    random_state=42, 
    n_estimators=300,         
    max_depth=3,           
    min_samples_split=2,      
    min_samples_leaf=2,       
)

rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

cv_scores_rfc = cross_val_score(rfc, Features_clf, Target_clf, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores for Random Forest Classifier:", cv_scores_rfc)
print("Mean accuracy score:", cv_scores_rfc.mean())
Cross-validation accuracy scores for Random Forest Classifier: [0.93333333 0.7        0.73333333 0.78333333 0.8       ]
Mean accuracy score: 0.79
In [95]:
#to view the actual and predicted values
for actual, predicted in zip(y_test, y_pred_reg):
    print(f"Actual: {actual}, Predicted: {int(predicted)}")
Actual: 0, Predicted: 978
Actual: 1, Predicted: 22402
Actual: 1, Predicted: 2895
Actual: 1, Predicted: 10013
Actual: 0, Predicted: 10082
Actual: 1, Predicted: 7024
Actual: 1, Predicted: 16117
Actual: 1, Predicted: 21932
Actual: 0, Predicted: 7211
Actual: 0, Predicted: 919
Actual: 0, Predicted: 9603
Actual: 1, Predicted: 23343
Actual: 0, Predicted: 2786
Actual: 0, Predicted: 3865
Actual: 0, Predicted: 5421
Actual: 0, Predicted: 4372
Actual: 1, Predicted: 4517
Actual: 0, Predicted: 2125
Actual: 0, Predicted: 2261
Actual: 0, Predicted: 1195
Actual: 1, Predicted: 2196
Actual: 0, Predicted: 11410
Actual: 0, Predicted: 759
Actual: 1, Predicted: 3339
Actual: 0, Predicted: 656
Actual: 1, Predicted: 860
Actual: 0, Predicted: 4539
Actual: 0, Predicted: 5607
Actual: 1, Predicted: 5883
Actual: 0, Predicted: 2907
Actual: 1, Predicted: 14658
Actual: 1, Predicted: 7766
Actual: 1, Predicted: 3138
Actual: 1, Predicted: 1527
Actual: 1, Predicted: 8027
Actual: 0, Predicted: 2877
Actual: 0, Predicted: 3717
Actual: 0, Predicted: 20326
Actual: 0, Predicted: 9513
Actual: 1, Predicted: 3037
Actual: 0, Predicted: 515
Actual: 1, Predicted: 21791
Actual: 0, Predicted: 10407
Actual: 1, Predicted: 2360
Actual: 0, Predicted: 1641
Actual: 0, Predicted: 1439
Actual: 0, Predicted: 706
Actual: 1, Predicted: 3131
Actual: 0, Predicted: 2502
Actual: 1, Predicted: 3677
Actual: 0, Predicted: 13820
Actual: 0, Predicted: 4010
Actual: 1, Predicted: 10658
Actual: 0, Predicted: 8420
Actual: 0, Predicted: 2593
Actual: 0, Predicted: 3445
Actual: 1, Predicted: 1822
Actual: 0, Predicted: 7431
Actual: 0, Predicted: 5725
Actual: 1, Predicted: 2822
In [ ]: